#!/usr/bin/env python

'''
Takes a text dump from a pipermail archive and
pickles a list of dictionaries:
    [{'fromline':' ',
      'dateline':' ',
      'subjectline':' ',
      'messageline':' '}, 
      ..., 
      {lastmessage}
    ]
'''

import pickle

data_file = '/.../nb.txt'
htmldelim = '-------------- next part --------------'
pickle_file = 'data/discuss.p'


def _is_unquoted(line, marker):
    '''Return True when `marker` occurs in `line` and the line has no '>'.

    A '>' anywhere in the line is treated as a sign of quoted reply text,
    so such lines are never interpreted as headers or delimiters.
    '''
    return marker in line and '>' not in line


def parse_messages(lines, delimiter=htmldelim):
    '''Split an iterable of archive lines into a list of message dicts.

    Each returned dict has a 'fromline' and a 'messageline' key, plus
    'dateline' and 'subjectline' when those headers were seen.  Body text
    is collected only from lines that follow a 'Message-ID:' line.

    lines     -- iterable of text lines (an open file works).
    delimiter -- marker that ends a message and starts an attachment stub.

    Returns the list of message dictionaries, in file order.
    '''
    messages = []
    current = {}
    body = []          # body lines, joined once per message
    in_body = False    # True once 'Message-ID:' has been seen

    for line in lines:
        if _is_unquoted(line, delimiter):
            # Attachment stub: close the current message and discard any
            # partial record that never got a 'From:' header.
            if 'fromline' in current:
                current['messageline'] = ''.join(body)
                messages.append(current)
            current = {}
            body = []
            in_body = False
            continue

        if _is_unquoted(line, 'From:'):
            # A new 'From:' header also closes a message still in progress.
            if 'fromline' in current:
                current['messageline'] = ''.join(body)
                messages.append(current)
                current = {}
                body = []
                in_body = False
            current['fromline'] = line.split('From:')[-1].strip()
            continue

        if _is_unquoted(line, 'Date:'):
            # Only the first date seen for a message is kept.
            current.setdefault('dateline', line.split('Date:')[-1].strip())
            continue

        if _is_unquoted(line, 'Subject:'):
            current['subjectline'] = line.split('Subject:')[-1].strip()
            continue

        if 'Message-ID:' in line:
            in_body = True
            continue

        if in_body:
            body.append(line)

    # The final message has no following header or delimiter to close it,
    # so it must be flushed explicitly or it would be lost.
    if 'fromline' in current:
        current['messageline'] = ''.join(body)
        messages.append(current)

    return messages


def main():
    '''Parse `data_file` and pickle the resulting list to `pickle_file`.'''
    with open(data_file, 'r') as f:
        message_container = parse_messages(f)

    # Use a context manager so the pickle is flushed and the handle closed.
    with open(pickle_file, 'wb') as out:
        pickle.dump(message_container, out)


if __name__ == '__main__':
    main()

